Los datos provienen de campañas de marketing directo (llamadas telefónicas) de una institución bancaria portuguesa. A menudo se requería más de un contacto con el mismo cliente para determinar si el producto (depósito bancario a plazo) sería suscrito ('sí') o no ('no'). Por tanto, el objetivo de la clasificación es predecir si el cliente suscribirá (sí/no) un depósito a plazo (variable y).
# Exploratory analysis of the Portuguese bank direct-marketing dataset.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
sns.set_theme(color_codes=True)
import warnings
# NOTE(review): this silences ALL warnings, including deprecations that may
# flag real problems — consider filtering specific categories instead.
warnings.filterwarnings("ignore")
# The CSV uses ';' as field separator (bank-additional-full.csv).
df = pd.read_csv("bank-additional-full.csv", delimiter=";")
# Show every column when displaying frames (the dataset has 21 columns).
pd.set_option("display.max_columns", None)
df.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | 261 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | 149 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | 226 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | 151 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
| 4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | 307 | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | no |
# Column dtypes and non-null counts: 41188 rows, no missing values anywhere.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 41188 entries, 0 to 41187 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 41188 non-null int64 1 job 41188 non-null object 2 marital 41188 non-null object 3 education 41188 non-null object 4 default 41188 non-null object 5 housing 41188 non-null object 6 loan 41188 non-null object 7 contact 41188 non-null object 8 month 41188 non-null object 9 day_of_week 41188 non-null object 10 duration 41188 non-null int64 11 campaign 41188 non-null int64 12 pdays 41188 non-null int64 13 previous 41188 non-null int64 14 poutcome 41188 non-null object 15 emp.var.rate 41188 non-null float64 16 cons.price.idx 41188 non-null float64 17 cons.conf.idx 41188 non-null float64 18 euribor3m 41188 non-null float64 19 nr.employed 41188 non-null float64 20 y 41188 non-null object dtypes: float64(5), int64(5), object(11) memory usage: 6.6+ MB
# Double-check: per-column null counts (all zero).
df.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
# Subset of the categorical (object-dtype) columns, explored separately below.
categorical_cols = ["job", "marital", "education", "default", "housing", "loan",
                    "contact", "month", "day_of_week", "poutcome", "y"]
df_categoricos = df[categorical_cols]
df_categoricos.head()
| job | marital | education | default | housing | loan | contact | month | day_of_week | poutcome | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | nonexistent | no |
| 1 | services | married | high.school | unknown | no | no | telephone | may | mon | nonexistent | no |
| 2 | services | married | high.school | no | yes | no | telephone | may | mon | nonexistent | no |
| 3 | admin. | married | basic.6y | no | no | no | telephone | may | mon | nonexistent | no |
| 4 | services | married | high.school | no | no | yes | telephone | may | mon | nonexistent | no |
# Subset of the numeric columns, explored separately below.
numeric_cols = ["age", "duration", "campaign", "pdays", "previous",
                "emp.var.rate", "cons.price.idx", "cons.conf.idx",
                "euribor3m", "nr.employed"]
df_numericos = df[numeric_cols]
df_numericos.head()
| age | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 261 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 1 | 57 | 149 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 2 | 37 | 226 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 3 | 40 | 151 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
| 4 | 56 | 307 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 |
# Docs: https://seaborn.pydata.org/generated/seaborn.countplot.html
cat_vars = ["job", "marital", "education", "default", "housing", "loan",
            "contact", "month", "day_of_week", "poutcome"]
# 2x5 grid: one countplot per categorical variable, split by the target class.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for i, var in enumerate(cat_vars):
    sns.countplot(x=var, hue="y", data=df_categoricos, ax=axs[i])
    # Rotate the tick labels in place. The original pattern
    # set_xticklabels(get_xticklabels(), ...) without set_xticks() triggers a
    # UserWarning on recent matplotlib; tick_params avoids it.
    axs[i].tick_params(axis="x", rotation=90)
# Avoid overlapping subplots.
fig.tight_layout()
plt.show()
# Docs: https://seaborn.pydata.org/generated/seaborn.histplot.html
# List of categorical variables to plot.
cat_vars = ["job", "marital", "education", "default", "housing", "loan",
            "contact", "month", "day_of_week", "poutcome"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
# Normalized ("fill") bars: per-category class share of the target y.
for i, var in enumerate(cat_vars):
    sns.histplot(x=var, hue="y", data=df_categoricos, ax=axs[i],
                 multiple="fill", kde=False, element="bars", fill=True,
                 stat="density")
    # BUG FIX: the original replaced the tick labels with
    # df_categoricos[var].unique(), whose order of first appearance need not
    # match the order seaborn actually drew the bars in — which can silently
    # mislabel categories. Rotate the existing (correct) labels instead.
    axs[i].tick_params(axis="x", rotation=90)
    axs[i].set_xlabel(var)
# Avoid overlapping subplots.
fig.tight_layout()
plt.show()
La mayoría de las personas que suscriben los depósitos bancarios a plazo son: jubilados y estudiantes.
La mayoría de las personas que suscriben los depósitos bancarios a plazo son contactadas por vía celular.
La mayoría de las personas que suscriben los depósitos bancarios a plazo tienen su último contacto en: octubre, diciembre, marzo, septiembre
La mayoría de las personas que suscriben el depósito bancario a plazo han valorado exitosamente la campaña de marketing.
# Docs: https://seaborn.pydata.org/generated/seaborn.boxplot.html
num_vars = ["age", "duration", "campaign", "pdays", "previous",
            "emp.var.rate", "cons.price.idx", "cons.conf.idx",
            "euribor3m", "nr.employed"]
# One box plot per numeric variable to eyeball spread and outliers.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.boxplot(x=var, data=df, ax=ax)
fig.tight_layout()
plt.show()
# Docs: https://seaborn.pydata.org/generated/seaborn.violinplot.html
num_vars = ["age", "duration", "campaign", "pdays", "previous",
            "emp.var.rate", "cons.price.idx", "cons.conf.idx",
            "euribor3m", "nr.employed"]
# Violin plots show the full distribution shape of each numeric variable.
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.violinplot(x=var, data=df, ax=ax)
fig.tight_layout()
plt.show()
# Same violin plots, now split by the target class y.
num_vars = ["age", "duration", "campaign", "pdays", "previous",
            "emp.var.rate", "cons.price.idx", "cons.conf.idx",
            "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.violinplot(x=var, y="y", data=df, ax=ax)
fig.tight_layout()
plt.show()
# Histogram of each numeric variable (marginal distributions).
num_vars = ["age", "duration", "campaign", "pdays", "previous",
            "emp.var.rate", "cons.price.idx", "cons.conf.idx",
            "euribor3m", "nr.employed"]
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.histplot(x=var, data=df, ax=ax)
fig.tight_layout()
plt.show()
# Same histograms, stacked by the target class y.
num_vars = ['age', 'duration', 'campaign', 'pdays', 'previous',
            'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
            'euribor3m', 'nr.employed']
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
axs = axs.flatten()
for ax, var in zip(axs, num_vars):
    sns.histplot(x=var, hue='y', data=df, ax=ax, multiple="stack")
fig.tight_layout()
plt.show()
# Docs: https://seaborn.pydata.org/generated/seaborn.pairplot.html
# Numeric variable names (kept for reference; pairplot uses all numeric columns).
num_vars = ['age', 'duration', 'campaign', 'pdays', 'previous',
            'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
            'euribor3m', 'nr.employed']
# Scatter-plot matrix of every numeric column pair, colored by the target.
sns.pairplot(df, hue='y')
<seaborn.axisgrid.PairGrid at 0x1d826033ca0>
Devuelve valores únicos de una serie de objetos.
Los valores únicos se devuelven en orden de aparición. Se calculan mediante tablas hash y, por lo tanto, NO se devuelven ordenados.
https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
# Inspect the raw labels of every categorical column before integer-encoding
# them below (Series.unique returns values in order of first appearance).
df['job'].unique()
array(['housemaid', 'services', 'admin.', 'blue-collar', 'technician',
'retired', 'management', 'unemployed', 'self-employed', 'unknown',
'entrepreneur', 'student'], dtype=object)
df['marital'].unique()
array(['married', 'single', 'divorced', 'unknown'], dtype=object)
df['education'].unique()
array(['basic.4y', 'high.school', 'basic.6y', 'basic.9y',
'professional.course', 'unknown', 'university.degree',
'illiterate'], dtype=object)
df['default'].unique()
array(['no', 'unknown', 'yes'], dtype=object)
df['housing'].unique()
array(['no', 'yes', 'unknown'], dtype=object)
df['loan'].unique()
array(['no', 'yes', 'unknown'], dtype=object)
df['contact'].unique()
array(['telephone', 'cellular'], dtype=object)
df['month'].unique()
array(['may', 'jun', 'jul', 'aug', 'oct', 'nov', 'dec', 'mar', 'apr',
'sep'], dtype=object)
df['day_of_week'].unique()
array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object)
df['poutcome'].unique()
array(['nonexistent', 'failure', 'success'], dtype=object)
df['y'].unique()
array(['no', 'yes'], dtype=object)
# Label-encode every categorical column in place: LabelEncoder assigns integer
# codes in alphabetical order of the labels (e.g. for 'y': no -> 0, yes -> 1).
# NOTE(review): the same four lines are repeated per column — a single loop over
# the column list would be tidier, and re-importing sklearn.preprocessing each
# time is redundant.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['job']= label_encoder.fit_transform(df['job'])
df['job'].unique()
array([ 3, 7, 0, 1, 9, 5, 4, 10, 6, 11, 2, 8])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['marital']= label_encoder.fit_transform(df['marital'])
df['marital'].unique()
array([1, 2, 0, 3])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['education']= label_encoder.fit_transform(df['education'])
df['education'].unique()
array([0, 3, 1, 2, 5, 7, 6, 4])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['default']= label_encoder.fit_transform(df['default'])
df['default'].unique()
array([0, 1, 2])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['housing']= label_encoder.fit_transform(df['housing'])
df['housing'].unique()
array([0, 2, 1])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['loan']= label_encoder.fit_transform(df['loan'])
df['loan'].unique()
array([0, 2, 1])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['contact']= label_encoder.fit_transform(df['contact'])
df['contact'].unique()
array([1, 0])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['month']= label_encoder.fit_transform(df['month'])
df['month'].unique()
array([6, 4, 3, 1, 8, 7, 2, 5, 0, 9])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['day_of_week']= label_encoder.fit_transform(df['day_of_week'])
df['day_of_week'].unique()
array([1, 3, 4, 2, 0])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['poutcome']= label_encoder.fit_transform(df['poutcome'])
df['poutcome'].unique()
array([1, 0, 2])
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df['y']= label_encoder.fit_transform(df['y'])
df['y'].unique()
array([0, 1])
df.head()  # preview: every formerly-categorical column is now an integer code
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 56 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 261 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 1 | 57 | 7 | 1 | 3 | 1 | 0 | 0 | 1 | 6 | 1 | 149 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 2 | 37 | 7 | 1 | 3 | 0 | 2 | 0 | 1 | 6 | 1 | 226 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 3 | 40 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 6 | 1 | 151 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
| 4 | 56 | 7 | 1 | 3 | 0 | 0 | 2 | 1 | 6 | 1 | 307 | 1 | 999 | 0 | 1 | 1.1 | 93.994 | -36.4 | 4.857 | 5191.0 | 0 |
Distribución de la variable objetivo "y" (label)
# Class balance of the target: heavily skewed toward class 0 ("no").
# Fix: seaborn >= 0.12 no longer accepts the data vector positionally —
# pass it as the x= keyword.
sns.countplot(x=df['y'])
df['y'].value_counts()
0 36548 1 4640 Name: y, dtype: int64
from sklearn.utils import resample
# Split the frame into the majority (y == 0) and minority (y == 1) classes.
df_majority = df[(df['y']==0)]
df_minority = df[(df['y']==1)]
# Upsample the minority class with replacement until it matches the majority.
# NOTE(review): upsampling BEFORE the train/test split lets duplicated minority
# rows land in both sets, which inflates the test metrics — confirm intended.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # match majority size (was hard-coded 36548)
                                 random_state=0)                # reproducible results
# Combine the majority class with the upsampled minority class.
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
# Fix: seaborn >= 0.12 requires the keyword x= here as well.
sns.countplot(x=df_upsampled['y'])
df_upsampled['y'].value_counts()
1 36548 0 36548 Name: y, dtype: int64
Detectar outliers es tedioso, especialmente cuando se tienen múltiples tipos de datos.
Por lo tanto, existen diferentes formas de detectar valores atípicos según el tipo de datos.
Para datos distribuidos normalmente, podemos usar el método Z-Score;
para datos sesgados (skewed), se usa el rango intercuartílico (IQR).
def remove_outliers_iqr(df, columns):
    """Drop rows falling outside 1.5*IQR of each listed column.

    Bounds are recomputed on the already-filtered frame for each successive
    column, so the result can depend on the order of `columns`.
    Returns a new (filtered) DataFrame; the input is not modified.
    """
    for col in columns:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        spread = q3 - q1
        low, high = q1 - 1.5 * spread, q3 + 1.5 * spread
        # between() is inclusive on both ends, matching >= low and <= high.
        df = df[df[col].between(low, high)]
    return df
# Columns on which to remove outliers.
columns_to_check = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
'euribor3m', 'nr.employed']
# Apply the IQR-based outlier-removal function to the upsampled frame.
df_clean = remove_outliers_iqr(df_upsampled, columns_to_check)
# Show the resulting DataFrame.
df_clean.head()
| age | job | marital | education | default | housing | loan | contact | month | day_of_week | duration | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 37017 | 25 | 8 | 2 | 7 | 1 | 2 | 0 | 0 | 3 | 3 | 371 | 1 | 999 | 0 | 1 | -2.9 | 92.469 | -33.6 | 1.044 | 5076.2 | 1 |
| 36682 | 51 | 9 | 2 | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 657 | 1 | 999 | 0 | 1 | -2.9 | 92.963 | -40.8 | 1.268 | 5076.2 | 1 |
| 29384 | 45 | 7 | 2 | 7 | 0 | 0 | 0 | 1 | 0 | 0 | 541 | 1 | 999 | 0 | 1 | -1.8 | 93.075 | -47.1 | 1.405 | 5099.1 | 1 |
| 21998 | 29 | 9 | 2 | 3 | 1 | 0 | 0 | 0 | 1 | 4 | 921 | 3 | 999 | 0 | 1 | 1.4 | 93.444 | -36.1 | 4.964 | 5228.1 | 1 |
| 16451 | 37 | 10 | 2 | 2 | 1 | 2 | 2 | 0 | 3 | 4 | 633 | 1 | 999 | 0 | 1 | 1.4 | 93.918 | -42.7 | 4.963 | 5228.1 | 1 |
df_clean.shape  # rows remaining after IQR outlier removal
(49702, 21)
Seaborn es una biblioteca de Python que facilita la creación de gráficos; su función heatmap() genera mapas de calor. Un mapa de calor es una representación gráfica de datos donde cada valor de una matriz se representa como un color.
# Correlation heatmap — every column is numeric after the label encoding above.
plt.figure(figsize=(20, 16))
sns.heatmap(df_clean.corr(), annot=True, fmt='.2g')
<AxesSubplot:>
# Features (all columns except the label) and target.
X = df_clean.drop('y', axis=1)
y = df_clean['y']
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
Para ser precisos, en los validadores cruzados de scikit-learn el método split() genera los índices de entrenamiento y prueba, no los datos en sí mismos.
Tener múltiples divisiones puede ser útil si se desea estimar mejor el rendimiento del modelo.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# 70/30 split with a fixed seed for reproducibility.
# NOTE(review): the split happens AFTER upsampling, so duplicated minority rows
# can appear in both train and test — the test scores below are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=0)
from sklearn.tree import DecisionTreeClassifier
# Shallow (depth-3) decision tree using the Gini impurity criterion, fixed seed.
clf_gini = DecisionTreeClassifier(random_state=0, max_depth=3, criterion='gini')
clf_gini.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=3, random_state=0)
# Predict on the test set with the Gini-criterion tree
y_pred_gini = clf_gini.predict(X_test)
# Report mean accuracy on the training and test sets
print('Precisión en el set de Entrenamiento: {:.2f}'
.format(clf_gini.score(X_train, y_train)))
print('Precisión en el set de Test: {:.2f}'
.format(clf_gini.score(X_test, y_test)))
Precisión en el set de Entrenamiento: 0.85 Precisión en el set de Test: 0.85
Accuracy score con criterio Gini Index
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score
# NOTE(review): on binary single-label data, micro-averaged F1, precision and
# recall all reduce to plain accuracy, so the first three printed values are
# identical. average='binary' would give class-1-specific scores instead.
print('F-1 Score : ',(f1_score(y_test, y_pred_gini, average='micro')))
print('Precision Score : ',(precision_score(y_test, y_pred_gini, average='micro')))
print('Recall Score : ',(recall_score(y_test, y_pred_gini, average='micro')))
print('Jaccard Score : ',(jaccard_score(y_test, y_pred_gini, average='micro')))
F-1 Score : 0.8468915565689761 Precision Score : 0.846891556568976 Recall Score : 0.846891556568976 Jaccard Score : 0.7344422472955682
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
# Per-class precision / recall / F1 summary for the Gini tree
print (classification_report(y_test, y_pred_gini))
precision recall f1-score support
0 0.87 0.87 0.87 8875
1 0.81 0.81 0.81 6036
accuracy 0.85 14911
macro avg 0.84 0.84 0.84 14911
weighted avg 0.85 0.85 0.85 14911
# Confusion matrix for the Gini tree.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_gini)
# BUG FIX: sklearn's confusion matrix has TRUE classes on the rows and
# PREDICTED classes on the columns, ordered [0, 1]. The original labels had
# actual/predicted swapped AND listed the positive class first.
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predicted Negative:0', 'Predicted Positive:1'])
# Create/size the figure BEFORE drawing — afterwards it just opens an empty one.
plt.figure(figsize=(9,9))
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='Blues')
<Figure size 900x900 with 0 Axes>
<Figure size 900x900 with 0 Axes>
plt.figure(figsize=(12,8))
from sklearn import tree
# clf_gini is already fitted above — re-fitting inside the plot call was
# redundant work. Passing feature_names shows real column names instead of X[i].
tree.plot_tree(clf_gini, feature_names=list(X_train.columns))
[Text(0.5, 0.875, 'X[10] <= 362.5\ngini = 0.48\nsamples = 34791\nvalue = [20859, 13932]'), Text(0.25, 0.625, 'X[18] <= 3.168\ngini = 0.34\nsamples = 22575\nvalue = [17663, 4912]'), Text(0.125, 0.375, 'X[19] <= 5087.65\ngini = 0.495\nsamples = 8226\nvalue = [3702, 4524]'), Text(0.0625, 0.125, 'gini = 0.352\nsamples = 3618\nvalue = [825, 2793]'), Text(0.1875, 0.125, 'gini = 0.469\nsamples = 4608\nvalue = [2877, 1731]'), Text(0.375, 0.375, 'X[8] <= 7.5\ngini = 0.053\nsamples = 14349\nvalue = [13961, 388]'), Text(0.3125, 0.125, 'gini = 0.03\nsamples = 14167\nvalue = [13949, 218]'), Text(0.4375, 0.125, 'gini = 0.123\nsamples = 182\nvalue = [12, 170]'), Text(0.75, 0.625, 'X[10] <= 525.5\ngini = 0.386\nsamples = 12216\nvalue = [3196, 9020]'), Text(0.625, 0.375, 'X[18] <= 2.916\ngini = 0.492\nsamples = 4384\nvalue = [1913, 2471]'), Text(0.5625, 0.125, 'gini = 0.309\nsamples = 2083\nvalue = [398, 1685]'), Text(0.6875, 0.125, 'gini = 0.45\nsamples = 2301\nvalue = [1515, 786]'), Text(0.875, 0.375, 'X[18] <= 1.402\ngini = 0.274\nsamples = 7832\nvalue = [1283, 6549]'), Text(0.8125, 0.125, 'gini = 0.136\nsamples = 2315\nvalue = [170, 2145]'), Text(0.9375, 0.125, 'gini = 0.322\nsamples = 5517\nvalue = [1113, 4404]')]
# Install Graphviz in Python: pip install graphviz
import graphviz
import pydotplus
%matplotlib inline
# Export the fitted Gini tree to DOT format and render it with graphviz.
# NOTE(review): pydotplus is imported but never used in this cell.
dot_data = tree.export_graphviz(clf_gini, out_file=None, max_depth=None,
feature_names=X_train.columns,
class_names=True,
filled=True, rotate=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
from sklearn.tree import DecisionTreeClassifier
# Same shallow tree, now splitting on information gain (entropy) instead of Gini.
clf_en = DecisionTreeClassifier(random_state=0, max_depth=3, criterion='entropy')
clf_en.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
Predecir con test set para el criterio entropy
# Predict on the test set with the entropy-criterion tree
y_pred_en = clf_en.predict(X_test)
# Report mean accuracy on the training and test sets
print('Precisión en el set de Entrenamiento: {:.2f}'
.format(clf_en.score(X_train, y_train)))
print('Precisión en el set de Test: {:.2f}'
.format(clf_en.score(X_test, y_test)))
Precisión en el set de Entrenamiento: 0.85 Precisión en el set de Test: 0.85
Accuracy score con criterio Entropy
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, jaccard_score
# NOTE(review): as with the Gini tree, micro-averaging on binary data makes
# F1 == precision == recall == accuracy; the printed values repeat one number.
print('F-1 Score : ',(f1_score(y_test, y_pred_en, average='micro')))
print('Precision Score : ',(precision_score(y_test, y_pred_en, average='micro')))
print('Recall Score : ',(recall_score(y_test, y_pred_en, average='micro')))
print('Jaccard Score : ',(jaccard_score(y_test, y_pred_en, average='micro')))
F-1 Score : 0.8468915565689761 Precision Score : 0.846891556568976 Recall Score : 0.846891556568976 Jaccard Score : 0.7344422472955682
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
# Per-class precision / recall / F1 summary for the entropy tree
print (classification_report(y_test, y_pred_en))
precision recall f1-score support
0 0.87 0.87 0.87 8875
1 0.81 0.81 0.81 6036
accuracy 0.85 14911
macro avg 0.84 0.84 0.84 14911
weighted avg 0.85 0.85 0.85 14911
# Confusion matrix for the entropy tree.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_en)
# BUG FIX: rows are TRUE classes, columns PREDICTED, ordered [0, 1] — the
# original labels had actual/predicted swapped and the class order reversed.
cm_matrix = pd.DataFrame(data=cm,
                         index=['Actual Negative:0', 'Actual Positive:1'],
                         columns=['Predicted Negative:0', 'Predicted Positive:1'])
# Create/size the figure BEFORE drawing — afterwards it just opens an empty one.
plt.figure(figsize=(9,9))
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='Reds')
<Figure size 900x900 with 0 Axes>
<Figure size 900x900 with 0 Axes>
plt.figure(figsize=(12,8))
from sklearn import tree
# clf_en is already fitted above — re-fitting inside the plot call was
# redundant work. Passing feature_names shows real column names instead of X[i].
tree.plot_tree(clf_en, feature_names=list(X_train.columns))
[Text(0.5, 0.875, 'X[10] <= 362.5\nentropy = 0.971\nsamples = 34791\nvalue = [20859, 13932]'), Text(0.25, 0.625, 'X[18] <= 3.168\nentropy = 0.756\nsamples = 22575\nvalue = [17663, 4912]'), Text(0.125, 0.375, 'X[19] <= 5087.65\nentropy = 0.993\nsamples = 8226\nvalue = [3702, 4524]'), Text(0.0625, 0.125, 'entropy = 0.775\nsamples = 3618\nvalue = [825, 2793]'), Text(0.1875, 0.125, 'entropy = 0.955\nsamples = 4608\nvalue = [2877, 1731]'), Text(0.375, 0.375, 'X[8] <= 7.5\nentropy = 0.179\nsamples = 14349\nvalue = [13961, 388]'), Text(0.3125, 0.125, 'entropy = 0.115\nsamples = 14167\nvalue = [13949, 218]'), Text(0.4375, 0.125, 'entropy = 0.351\nsamples = 182\nvalue = [12, 170]'), Text(0.75, 0.625, 'X[10] <= 525.5\nentropy = 0.829\nsamples = 12216\nvalue = [3196, 9020]'), Text(0.625, 0.375, 'X[18] <= 2.916\nentropy = 0.988\nsamples = 4384\nvalue = [1913, 2471]'), Text(0.5625, 0.125, 'entropy = 0.704\nsamples = 2083\nvalue = [398, 1685]'), Text(0.6875, 0.125, 'entropy = 0.926\nsamples = 2301\nvalue = [1515, 786]'), Text(0.875, 0.375, 'X[18] <= 1.402\nentropy = 0.643\nsamples = 7832\nvalue = [1283, 6549]'), Text(0.8125, 0.125, 'entropy = 0.379\nsamples = 2315\nvalue = [170, 2145]'), Text(0.9375, 0.125, 'entropy = 0.725\nsamples = 5517\nvalue = [1113, 4404]')]
# Export the fitted entropy tree to DOT format and render it with graphviz
dot_data = tree.export_graphviz(clf_en, out_file=None, max_depth=None,
feature_names=X_train.columns,
class_names=True,
filled=True, rotate=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph